Our design is a Raspberry Pi-based intelligent assistant that can freestyle rap about a topic given by the user's voice command. It automatically detects the trigger word (its name, "Andrew"), transcribes the voice collected from the microphone into text, extracts the topic from the natural-language sentence, generates related lyrics and a background beat, and finally plays them through the speaker. We also implemented a screen display that shows the dialogue content with signal waves as the background. Finally, we enhanced the dialogue-based interaction so that it can speak the current weather of a given location. It is an embedded device with a microphone and a speaker as input and output, and it interacts with users using voice and language processing algorithms.
      







We largely followed our planned schedule and accomplished the basic functions set out in our initial proposal. In addition, this freestyle Pi supports some dialogue-based interactions; for example, it can tell you the weather at a given location and who the most handsome man in the world is. We therefore consider this project a success. Future work is needed to make this voice assistant more intelligent.

        
        xf78@cornell.edu
Designed the overall software architecture (Just being himself).
        yz2453@cornell.edu
Lyrics and Beat Generation
Figure Design and Video Editing
| Parts | From | Cost | 
|---|---|---|
| Raspberry Pi | Lab | $0.00 | 
| Speaker | Lab | $0.00 | 
| PS3 Eye Microphone | Amazon | $8.53 | 
        The Largest Vocabulary In Music
        Keras LSTM
          example
        Deep Learning AI Specialization
        Pyalsaaudio Library
        Mycroft-precise
      
    import sys
    import time
    import random
    import queue
    import threading
    
    from termcolor import cprint
    from utils.audio import ResumableMicrophoneStream
    from utils.detect_queue import DetectQueue
    from utils.credentials import init_credentials
    
    from trigger_detector import TriggerDetector
    
    from speech_to_text import SpeechToText
    from lang_understand import LangUnderstand
    from text_to_speech import TextToSpeech
    from lyrics_generator import LyricsGenerator
    
    from tft_display import TFTDisplay
    
    # Audio recording parameters
    # SAMPLE_RATE and CHUNK_SIZE are handed to ResumableMicrophoneStream when the
    # assistant is constructed.
    SAMPLE_RATE = 16000  # presumably samples per second (Hz) — TODO confirm
    CHUNK_SIZE = int(SAMPLE_RATE / 10)  # 100ms
    # NOTE(review): STREAM_LIMIT is not referenced anywhere in this listing;
    # its meaning and units cannot be confirmed from here.
    STREAM_LIMIT = 5000
    
    
    class Andrew(object):
        """The rap voice assistant.

        Ties together the microphone stream, the trigger-word detector, the
        speech-to-text / language-understanding / text-to-speech services, the
        lyrics generator and the TFT display, and runs the interaction loop
        via :meth:`start`.
        """

        # Detector probability above which one audio chunk counts as a hit.
        TRIGGER_THRESHOLD = 0.6
        # Value of ``pred_queue.count`` at which the assistant wakes up.
        TRIGGER_HITS = 2

        # OpenWeatherMap "current weather" endpoint and request timeout (seconds).
        WEATHER_URL = "https://api.openweathermap.org/data/2.5/weather"
        WEATHER_TIMEOUT = 10

        def __init__(self, detect_model="data/andrew2.net",
                            lyrics_model="data/keras_model_1200.h5",
                            lyrics_chars="data/chars.pkl"):
            """Create all components and start the display thread.

            Args:
                detect_model: Path handed to ``TriggerDetector``.
                lyrics_model: Model path handed to ``LyricsGenerator``.
                lyrics_chars: Character-table path handed to ``LyricsGenerator``.
            """
            # microphone
            self.mic = ResumableMicrophoneStream(SAMPLE_RATE, CHUNK_SIZE)

            # wake word detector
            self.detector = TriggerDetector(detect_model)

            # speech and language services
            self.speech_client = SpeechToText()
            self.luis = LangUnderstand()
            self.tts = TextToSpeech()

            # lyrics generator model
            self.lyrics_gen = LyricsGenerator(lyrics_model, lyrics_chars)

            # Recent trigger predictions; see _listen_for_trigger.
            self.pred_queue = DetectQueue(maxlen=5)
            self.is_wakeup = False

            # TFT display: state updates are pushed through tft_queue and
            # consumed by a daemon thread so it never blocks interpreter exit.
            self.tft = TFTDisplay()
            self.tft_queue = queue.Queue()
            self.tft_thread = threading.Thread(target=self.tft_manage, args=())
            self.tft_thread.daemon = True
            self.tft_thread.start()

            self.notify("hi_there")

        def notify(self, topic="hi_there", is_async=False, audio_path="data/audio"):
            """Play the preset audio file ``<audio_path>/<topic>.wav``.

            Does nothing when the file does not exist.

            Args:
                topic: Base name (without extension) of the preset clip.
                is_async: Forwarded to ``TextToSpeech.play_file``.
                audio_path: Directory containing the preset clips.
            """
            from os.path import join, isfile

            audio_file = join(audio_path, f"{topic}.wav")
            if not isfile(audio_file):
                return

            self.tts.play_file(audio_file, is_async)

        def generate_rap(self, topic="", beat_path="data/beat"):
            """Generate rap lyrics about ``topic`` and play them over a beat.

            Args:
                topic: Subject passed to the lyrics generator.
                beat_path: Directory containing ``beat_<n>.wav`` files.
            """
            tts = self.tts

            # Announce first; the second argument is True so that (presumably)
            # playback does not hold up lyric generation — TODO confirm.
            intro = tts.generate_speech(f"hey, I can rap about {topic}")
            tts.play(intro, True)

            # Generate lyrics for the topic and synthesise them.
            lyrics_output = self.lyrics_gen.generate(topic)
            lyrics_speech = tts.generate_speech(lyrics_output)

            # randint is inclusive on both ends: picks beat_0 .. beat_20.
            beat_index = random.randint(0, 20)

            # Play beat and lyrics
            tts.play_file(f'{beat_path}/beat_{beat_index}.wav', True)
            tts.play(lyrics_speech)

        @staticmethod
        def _format_weather(res, city):
            """Build the spoken weather sentence from a decoded API response.

            Args:
                res: Decoded JSON with ``weather[0].description`` and ``main.temp``.
                city: City name to mention in the sentence.

            Raises:
                KeyError, IndexError, TypeError, ValueError: if ``res`` does not
                    have the expected shape.
            """
            msg_weather = f"Today, it's {res['weather'][0]['description']} in {city}. "
            msg_temp = f"The temperature is {int(res['main']['temp'])} degrees."
            return msg_weather + msg_temp

        def get_weather_message(self, city="Ithaca"):
            """Return a sentence describing the current weather in ``city``.

            Queries OpenWeatherMap (imperial units, country fixed to ``us``)
            using the API key from the ``WEATHER_APIKEY`` environment variable.

            Returns:
                The weather sentence, or ``""`` when the lookup fails.
            """
            import os
            import requests

            # Let requests build and URL-encode the query string.
            query = {
                "q": f"{city},us",
                "units": "imperial",
                "APPID": os.getenv('WEATHER_APIKEY'),
            }
            try:
                response = requests.get(self.WEATHER_URL, params=query,
                                        timeout=self.WEATHER_TIMEOUT)
                return self._format_weather(response.json(), city)
            except (requests.RequestException, ValueError,
                    KeyError, IndexError, TypeError) as err:
                # Best effort: network failure, non-JSON body, or an error
                # payload without the expected fields.
                cprint(f'Weather lookup failed: {err}', 'red')

            return ""

        def intent_recognize(self, text=""):
            """Recognize the intent of ``text`` and act on it.

            "Freestyle" raps about the first recognised entity (default "rap"),
            "Weather" speaks the weather, anything else plays the "sorry" clip.
            """
            tts = self.tts

            # Get result from language understanding engine
            luis_result = self.luis.predict(text)
            intent = luis_result.top_scoring_intent.intent

            if intent == "Freestyle":
                entities = luis_result.entities
                entity_topic = "rap"
                if entities:
                    entity_topic = entities[0].entity
                    cprint(f'The topic is {entity_topic}', 'cyan')
                self.generate_rap(entity_topic)

            elif intent == "Weather":
                response = tts.generate_speech("I will tell you the weather in Ithaca.")
                tts.play(response)

                weather = self.get_weather_message()
                if weather:
                    tts.play(tts.generate_speech(weather))
                else:
                    # Lookup failed: apologise rather than synthesising "".
                    self.notify("sorry")

            else:
                self.notify("sorry")

        def tft_manage(self):
            """Manage TFT display through state (runs forever in a thread).

            Reads ``{'state': ...}`` dicts from ``tft_queue``; redraws the wave
            while the state is ``'wait'`` or ``'listen'``.
            """
            self.tft.display_text("Andrew is waking up")
            status = {'state': 'None'}

            while True:
                state = status['state']
                # Compare by value; identity (`is`) on str literals only works
                # by accident of interning.
                if state == 'wait':
                    self.tft.display_wave()
                elif state == 'listen':
                    self.tft.display_wave((0, 255, 0))

                # While animating, poll so the wave keeps redrawing; otherwise
                # block until an update arrives instead of spinning the CPU.
                animating = state in ('wait', 'listen')
                try:
                    update = self.tft_queue.get(block=not animating)
                except queue.Empty:
                    continue

                if update is not None:
                    status = update

        def _listen_for_trigger(self, stream):
            """Feed microphone chunks to the detector until the stream closes.

            Sets ``is_wakeup`` and pauses the stream once enough recent chunks
            exceed ``TRIGGER_THRESHOLD``.
            """
            stream.closed = False

            while not stream.closed:
                stream.audio_input = []

                for chunk in stream.generator():
                    if self.is_wakeup:
                        continue

                    prob = self.detector.get_prediction(chunk)
                    hit = prob > self.TRIGGER_THRESHOLD

                    self.pred_queue.append(hit)
                    print('!' if hit else '.', end='', flush=True)

                    # NOTE(review): assumes DetectQueue.count is the number of
                    # positive entries — confirm against utils.detect_queue.
                    if self.pred_queue.count >= self.TRIGGER_HITS:
                        self.notify("hi")
                        cprint(' Trigger word detected! \n', 'magenta')
                        self.pred_queue.clear()
                        self.is_wakeup = True
                        stream.pause()
                        break

        def _handle_command(self, stream):
            """Transcribe one voice command, act on it, then go back to sleep."""
            cprint('Speech to text\n', 'green')

            time.sleep(1)
            stream.closed = False

            try:
                voice_command = self.speech_client.recognize(stream)

                cprint(f'{voice_command}\n', 'yellow')
                cprint('Recognition ended...\n', 'red')

                stream.pause()

                if "goodbye" in voice_command:
                    self.notify("see_you")
                    # SystemExit is not an Exception subclass, so it passes
                    # through the handler below and ends the program.
                    sys.exit()

                if "sorry" in voice_command:
                    self.notify("its_ok")
                else:
                    cprint('Recognize intents...', 'cyan')
                    self.intent_recognize(voice_command)

            except Exception as e:
                # Keep the assistant alive on any per-command failure.
                cprint(f'Error: {e}', 'red')

            self.is_wakeup = False

        def start(self):
            """Start listening and interacting.

            Alternates forever between waiting for the trigger word and
            handling one voice command; returns only via ``sys.exit`` on
            "goodbye".
            """
            # Init stream
            with self.mic as stream:

                self.tft_queue.put({'state': 'listen'})

                while True:
                    if not self.is_wakeup:
                        self._listen_for_trigger(stream)
                    else:
                        self._handle_command(stream)
    
    
    def main():
        """Initialise cloud-service credentials, then build and run Andrew."""
        # Credentials for the cloud services are set before the assistant is built.
        init_credentials()

        # Construct the assistant and enter its interaction loop.
        assistant = Andrew()
        assistant.start()
    
    
    # Run the assistant only when this file is executed as a script.
    if __name__ == "__main__":
        main()